In [1]:
# Import everything we need
import pandas as pd
import numpy as np
# Raise the pandas display width to 1000 characters so more data is visible
pd.options.display.width = 1000
In [2]:
# Load the dataset
tlo_data_file = 'data/tlo_checks_07.28.15_cleaned.csv'
# Read the CSV into a pandas DataFrame.
# `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0, so
# call `pd.read_csv` with the same settings: the first row is the header, the
# first column becomes the index, and the index is parsed as dates when possible.
# The keywords that were dropped (`encoding=None`, `tupleize_cols=False`) only
# restated defaults, and `infer_datetime_format` was a parsing speed-up that
# current pandas releases deprecate.
raw_data = pd.read_csv(
    tlo_data_file,
    header=0,
    sep=',',
    index_col=0,
    parse_dates=True,
)
raw_data.head()
Out[2]:
In [3]:
# Normalise the failure explanation text to lower case
failure_text = raw_data['failure_explanation']
raw_data['failure_explanation'] = failure_text.str.lower()
In [5]:
# Failure Explanations: 'dob', 'name', 'ssn dob name', 'ssn', 'ssn name', 'ssn dob','dob name', nan
# Integer code assigned to each known failure explanation label.
_FAILURE_EXPLANATION_CODES = {
    'dob': 0,
    'name': 1,
    'ssn dob name': 2,
    'ssn': 3,
    'ssn name': 4,
    'ssn dob': 5,
    'dob name': 6,
}


def update_failure_explanations(type):
    """Translate a failure explanation label into its integer code.

    The label is matched after lower-casing it and collapsing runs of
    whitespace, so values such as ``' SSN  DOB '`` resolve to the same code
    as ``'ssn dob'``.

    Parameters
    ----------
    type : str or any
        The failure explanation text. The parameter name shadows the
        ``type`` builtin; it is kept so existing callers are unaffected.

    Returns
    -------
    int or None
        The code (0-6) for a known label, or ``None`` when the value is
        not text (for example NaN or ``None``) or is not a known label.
    """
    try:
        # Non-text inputs (NaN floats, None, numbers) have no .lower()
        label = ' '.join(type.lower().split())
    except AttributeError:
        return None
    return _FAILURE_EXPLANATION_CODES.get(label)
In [6]:
# Swap each textual failure explanation for its integer code.
# NOTE: the column is overwritten, so re-running this cell passes the
# already-encoded values back through the encoder, which returns None for
# anything that is not one of the known text labels.
failure_codes = raw_data['failure_explanation'].apply(update_failure_explanations)
raw_data['failure_explanation'] = failure_codes
raw_data.head()
Out[6]:
In [7]:
# Handle missing values
# Rows without a failure explanation are missing in that column after encoding.
# Filling them with 0 would make them indistinguishable from code 0 ('dob'),
# so that column gets its own sentinel first; every other missing value
# still becomes 0.
MISSING_FAILURE_CODE = -1
raw_data = (
    raw_data
    .fillna({'failure_explanation': MISSING_FAILURE_CODE})
    .fillna(0)
)
raw_data.head()
Out[7]:
In [8]:
# Build the two arrays the model consumes: features and labels.
# Features are the first 22 columns by position; labels come from 'verified'.
# NOTE(review): if 'verified' sits among those first 22 columns the label
# leaks into the features -- confirm the column order.
feature_frame = raw_data.iloc[:, :22]
label_series = raw_data['verified']
tlo_data = feature_frame.values
tlo_targets = label_series.values
In [9]:
# Show the feature array as this cell's output
tlo_data
Out[9]:
In [10]:
from sklearn import linear_model
# Logistic regression classifier with C=1 and a fixed random_state
logClassifier = linear_model.LogisticRegression(
    random_state=111,
    C=1,
)
In [11]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the old module only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for testing, using a fixed random_state for the split
X_train, X_test, y_train, y_test = train_test_split(
    tlo_data,
    tlo_targets,
    test_size=0.20,
    random_state=111,
)
# Fit the classifier on the training portion; the fitted estimator is the cell output
logClassifier.fit(X_train, y_train)
Out[11]:
In [12]:
# Predict labels for the held-out test rows
predicted = logClassifier.predict(X=X_test)
predicted
Out[12]:
In [13]:
# Score the predictions against the true test labels
from sklearn import metrics
metrics.accuracy_score(y_true=y_test, y_pred=predicted)
Out[13]:
In [14]:
# Confusion matrix of the true test labels against the predictions
metrics.confusion_matrix(y_true=y_test, y_pred=predicted)
Out[14]:
In [15]:
import pickle
tlo_classifier_file = "models/tlo_lr_classifier_02.18.16.dat"
# Persist the trained classifier to disk. The context manager guarantees the
# file is flushed and closed, even if pickling raises.
with open(tlo_classifier_file, "wb") as model_file:
    pickle.dump(logClassifier, model_file)
In [16]:
# Recreate it as a test
# Only unpickle files this notebook wrote itself: pickle.load can execute
# arbitrary code when given an untrusted file.
with open(tlo_classifier_file, "rb") as model_file:
    logClassifier2 = pickle.load(model_file)
print(logClassifier2)
In [ ]: